/*******************************************************************************

PooledRegions make_ViltoVil_ViltoMktCost

2018.09.12 (DJ) Created

This do file creates (i) all villlages to all villages travel costs.
					 (ii) all villages to all markets travel costs.

*******************************************************************************/




/*******************************************************************************
PART 1. ALL VILLAGES TO ALL VILLAGES TRAVEL COSTS.


	Distances Data construction by adding up:
	
	(i) origin village to origin market
	(ii) origin turnoff to dest turnoff
	(iii) dest market to dest village
*******************************************************************************/

	
	
	
// Step 0: Prepare Turnoff-Turnoff google distances data
	// Routes ACROSS regions: This data has only A->B. Append B->A for merge.
		// This is A->B
		import delimited "${google}/T_to_T_Manyara_to_Kili_fix_gps_finished.csv", clear // there was a problem for MNADA WA SUNYA, so we did it again.  
		tempfile data_fix
		save `data_fix'
		import delimited "${google}/T_to_T_Manyara_to_Kili_gps_finished.csv", clear 
		drop if market_dest=="MNADA WA SUNYA" // this was replced by <Manyara_to_Kili_fix_gps>
		keep market_* distance* duration*
		append using `data_fix'
		tempfile data1
		save `data1'
		
		// This is B->A
		use `data1', clear 
		keep market_* distance* duration*
		rename (market_origin market_dest) (market_dest market_origin)
		append using `data1'
		
		tempfile T_to_T_crossregion
		save `T_to_T_crossregion'
		
		
	// Routes WITHIN Manyara: This data has only A->B. Append B->A for merge.
		// This is A->B
		import delimited "${google}/T_to_T_Manyara_fix_gps_finished.csv", clear
		tempfile data_fix
		save `data_fix'
		import delimited "${google}/T_to_T_Manyara_final.csv", clear 
		drop if market_origin=="MNADA WA SUNYA" 
		drop if market_dest=="MNADA WA SUNYA" // this was replced by <T_to_T_Manyara_fix_gps_finished>
		keep market_* distance* duration*
		append using `data_fix'
		tempfile data1
		save `data1'
		
		// This is B->A
		use `data1', clear
		keep market_* distance* duration*
		rename (market_origin market_dest) (market_dest market_origin)
		append using `data1'
	
		tempfile T_to_T_Manyara
		save `T_to_T_Manyara'
		
	// Routes WITHIN Kilimanjaro: This data has both A->B and B->A.
		import delimited "${google}/turnoff_to_turnoff_final.csv", clear
		keep market_* distance* duration*
		append using `T_to_T_crossregion'
		append using `T_to_T_Manyara'
		
		tempfile T_to_T_Kili
		save `T_to_T_Kili'
		
	// Routes between the same markets/turnoff set as zero.
		keep market_origin
		duplicates drop
		gen market_dest = market_origin
		gen duration_hrs = 0
		gen distance_km = 0
		 
		append using `T_to_T_Kili'
		rename (market_origin market_dest) (primary_market_O primary_market_D)
		duplicates tag primary_market_O primary_market_D, gen(dup)
		
		drop if primary_market_O=="MNADA WA SUNYA" & mi(place_origin)
		tempfile T_to_T_all
		save `T_to_T_all'
	
	
// Step 1: Prepare Village-to-Village within Markets google distances data
	// Routes WITHIN Markets (Kilimanjaro): This data has both directions A->B and B->A and as well as A->A	
		import delimited "${google}/V_to_V_within_M_Kilimanjaro.csv", clear
		tempfile VtoVinM_Kili
		save `VtoVinM_Kili'
		
	// Routes WITHIN Markets (Manyara): This data has both directions A->B and B->A and as well as A->A		
		import delimited "${google}/V_to_V_within_M_Manyara_final.csv", clear 
		keep if place_origin =="village" & place_dest =="village"
		
		// the grid adjustment is only applicable to "km", so the hours must be set as missing
		replace duration_hrs = . if (origin_grid_adjustment>0 & origin_grid_adjustment!=.) | ///
									(destination_grid_adjustment>0 & destination_grid_adjustment!=.) 
		
		append using `VtoVinM_Kili'
		
		geodist lat_origin lon_origin lat_dest lon_dest, gen(arcdist_vil_vil_km)
		//br *village*origin* lat_origin lon_origin *village*dest*  lat_dest lon_dest distance_km arcdist_vil_vil_km if distance_km ==0 & arcdist_vil_vil_km !=0
		
		replace distance_km = arcdist_vil_vil_km if distance_km ==0 & (arcdist_vil_vil_km !=0 & arcdist_vil_vil_km!=.)
		//gen diff = distance_km - arcdist_vil_vil_km
		//br *village*origin* lat_origin lon_origin *village*dest*  lat_dest lon_dest distance_km arcdist_vil_vil_km
		
		keep district* ward* village* market* distance* duration*
		duplicates drop
		
		rename (district_origin ward_origin village_name_origin market_origin) (survey_district_O survey_ward_O survey_village_O primary_market_O)
		rename (district_dest ward_dest village_name_dest market_dest) (survey_district_D survey_ward_D survey_village_D primary_market_D)
 
		foreach var of varlist survey_district*  survey_village*{
			replace `var' = subinstr(`var', "_", " ",.)
		}
		tempfile VtoVinM
		save `VtoVinM'
		
	
/********************************************************** 
Step 2: Create all villages - all villages pairs

There are a series of imputations here:

RQ_vil_mkt_cost_USD
RQ_turnoff_mkt_km 
google_turnoff_turnoff_km 
RQ_turnoff_mkt_km 
RQ_mkt_vil_cost_USD

**********************************************************/

	// impute missing turnoff-to-turnoff travel costs
		//cross-regions 
		import delimited "${google}/T_to_T_Manyara_to_Kili_fix_gps_finished.csv", clear // there was a problem for MNADA WA SUNYA, so we did it again.  
		tempfile data_fix
		save `data_fix'
		import delimited "${google}/T_to_T_Manyara_to_Kili_gps_finished.csv", clear 
		drop if market_dest=="MNADA WA SUNYA" // this was replced by <Manyara_to_Kili_fix_gps>
		keep place_origin place_dest lat* lon* distance* duration*
		append using `data_fix'
		tempfile t_to_t_cross
		save `t_to_t_cross'
		
		
		//within Manyara: This is A->B
		import delimited "${google}/T_to_T_Manyara_fix_gps_finished.csv", clear
		tempfile data_fix
		save `data_fix'
		import delimited "${google}/T_to_T_Manyara_final.csv", clear 
		drop if market_origin=="MNADA WA SUNYA" 
		drop if market_dest=="MNADA WA SUNYA" // this was replced by <T_to_T_Manyara_fix_gps_finished>
		keep place_origin place_dest lat* lon* distance* duration*
		append using `data_fix'
		tempfile t_to_t_manyara
		save `t_to_t_manyara'
		
		// within Kilimanjaro both A->B and B->A; convert this into A->B
			import delimited "${google}/turnoff_to_turnoff_final.csv", clear
			tempfile google_t_to_t
			save `google_t_to_t'
			drop if mi(rqa_turnoff_latitude_origin)
			keep market_origin rqa_turnoff_latitude_origin rqa_turnoff_longitude_origin 
			duplicates drop
			egen turnoff_id_origin = group(market_origin rqa_turnoff_latitude_origin rqa_turnoff_longitude_origin)
			gen join=1
			tempfile temp1
			save `temp1'
			
			rename turnoff_id_origin turnoff_id_dest
			rename market_origin market_dest
			rename rqa_turnoff_longitude_origin rqa_turnoff_longitude_dest
			rename rqa_turnoff_latitude_origin rqa_turnoff_latitude_dest
			
			/* create all possible unique pairs using joinby */
			joinby join using `temp1'
			drop if turnoff_id_origin==turnoff_id_dest
			egen d1=concat(turnoff_id_origin turnoff_id_dest)         /* create joint pairs in order 1   */
			egen d2=concat(turnoff_id_dest turnoff_id_origin)         /* create joint pairs in order 2   */
			replace d1=d2 if turnoff_id_origin>turnoff_id_dest        /* d1 has pairs in ascending order */
			sort d1 turnoff_id_origin
			duplicates drop d1, force
		
		// bring in distances 
			merge 1:1 market_origin rqa_turnoff_latitude_origin rqa_turnoff_longitude_origin market_dest rqa_turnoff_latitude_dest rqa_turnoff_longitude_dest using `google_t_to_t'
			drop if _merge==2
			append using `t_to_t_cross'
			append using `t_to_t_manyara'
		
		// run regression to collect coefficients
			geodist rqa_turnoff_latitude_origin rqa_turnoff_longitude_origin rqa_turnoff_latitude_dest rqa_turnoff_longitude_dest, gen(arcdist_t_t_km)
			gen ln_google_t_t_km = ln(distance_km)
			gen ln_arcdist_t_t_km = ln(arcdist_t_t_km)
			reg ln_google_t_t_km ln_arcdist_t_t_km
			local cons_t_t = _b[_cons]
			local coef_t_t = _b[ln_arcdist_t_t_km]
		
		
	// set up origin villages	

		// produce coefficients for market to turnoff regression using market-level data
			import delimited "${google}/mkt_turnoff_1_to_1_gps_google.csv", clear
			
			rename total_distance google_mkt_turnoff_km
			rename total_duration google_mkt_turnoff_hours
			
			geodist turnoff_lat turnoff_lon market_lat market_lon, gen(arcdist_mkt_turnoff_km)
			gen ln_arcdist_mkt_turnoff_km = ln(arcdist_mkt_turnoff_km)
			gen ln_google_mkt_turnoff_km = ln(google_mkt_turnoff_km)
			
			reg ln_google_mkt_turnoff_km ln_arcdist_mkt_turnoff_km
			local cons_mkt_t = _b[_cons]
			local coef_mkt_t = _b[ln_arcdist_mkt_turnoff_km]
			
			tempfile google_mkt_turnoff
			save `google_mkt_turnoff'
			
		// imputation for i) mkt to turnoff and (ii) vil to mkt rural travel costs
			use "${analysis}/Kilimanjaro rq_census_mt_vt_google.dta", clear
			rename  (district ward village_name market) (survey_district survey_ward survey_village primary_market)
			append using "${analysis}/Manyara rq_census_mt_vt_google.dta"	
			foreach var in USD USD_rt{
				replace RQ_mkt_vil_cost_`var' = RQ_vil_mkt_cost_`var' if mi(RQ_mkt_vil_cost_`var')
				replace RQ_vil_mkt_cost_`var' = RQ_mkt_vil_cost_`var' if mi(RQ_vil_mkt_cost_`var')
			}	
			
			merge m:1 survey_region primary_market using `google_mkt_turnoff', nogen
			
			// impute market to turnoff travel km
				cap drop arcdist_mkt_turnoff_km ln_arcdist_mkt_turnoff_km
				geodist turnoff_lat turnoff_lon market_lat market_lon, gen(arcdist_mkt_turnoff_km)
				gen ln_arcdist_mkt_turnoff_km = ln(arcdist_mkt_turnoff_km)
				
				gen yhat = `cons_mkt_t' + `coef_mkt_t'*ln_arcdist_mkt_turnoff_km
				
				replace google_mkt_turnoff_km = exp(yhat) 	if mi(google_mkt_turnoff_km)
				replace google_mkt_turnoff_km = 0 			if arcdist_mkt_turnoff_km==0
				cap drop yhat
			
			
			// impute market to village (or village to market)
				geodist village_lat village_lon market_lat market_lon, gen(arcdist_vil_mkt_km)
				
				gen ln_google_vil_mkt_km = ln(google_vil_mkt_km)
				gen ln_arcdist_vil_mkt_km = ln(arcdist_vil_mkt_km)
				reg ln_google_vil_mkt_km ln_arcdist_vil_mkt_km
				local cons_vil_mkt = _b[_cons]
				local coef_vil_mkt = _b[ln_arcdist_vil_mkt_km]
				
				predict yhat
				
				replace google_vil_mkt_km 	= exp(yhat) 			if mi(google_vil_mkt_km)
				replace google_vil_mkt_km 	= 0 					if arcdist_vil_mkt_km==0
			
			// replace costs using the imputed distances and cost/km
			replace RQ_mkt_vil_cost_USD = google_vil_mkt_km*${ruralcost_perkm} if mi(RQ_mkt_vil_cost_USD)
			replace RQ_vil_mkt_cost_USD = google_vil_mkt_km*${ruralcost_perkm} if mi(RQ_vil_mkt_cost_USD)
			
			drop yhat
			
			tempfile originvillageset
			save `originvillageset'
	
// create village-to-village level data using joinby
		cap drop google_vil_nearcity_nobabati_km
		keep survey_* primary_market google* RQ_mkt* RQ_vil* RQ_turnoff* village_lat village_lon turnoff_lat turnoff_lon market_lat market_lon arcdist*
		rename * *_O
		gen join=1
		tempfile origin
		save `origin'
		
		use `originvillageset'
		keep survey_* primary_market google* RQ_mkt* RQ_vil* RQ_turnoff* village_lat village_lon turnoff_lat turnoff_lon market_lat market_lon arcdist*
		cap drop google_vil_nearcity_nobabati_km
		rename * *_D
		gen join=1
		joinby join using `origin'
		drop join
	
	
// Step 3: Bring in <Turnoff to Turnoff Distances> and <Village to Village Distances within Market>
	merge m:1 primary_market_D primary_market_O using `T_to_T_all'
	drop if _merge==2
	drop _merge
	rename duration_hrs google_turnoff_turnoff_hrs
	rename distance_km google_turnoff_turnoff_km
	
	
	merge m:1 survey_district_O survey_ward_O survey_village_O primary_market_O ///
			  survey_district_D survey_ward_D survey_village_D primary_market_D using `VtoVinM'
	drop if _merge==2
	drop _merge
	rename duration_hrs google_vil_vil_inM_hrs
	rename distance_km google_vil_vil_inM_km
	
	order survey_r*O survey_d*O survey_w*O survey_v*O primary_market_O ///
		  survey_r*D survey_d*D survey_w*D survey_v*D primary_market_D  
	sort survey_r*O survey_d*O survey_w*O survey_v*O primary_market_O ///
		 survey_r*D survey_d*D survey_w*D survey_v*D primary_market_D  
	
	geodist  turnoff_lat_O turnoff_lon_O turnoff_lat_D turnoff_lon_D, gen(arcdist_t_t_km)
	gen ln_arcdist_t_t_km = ln(arcdist_t_t_km)
		
	//  use the coefficients from above from turnoff-turnoff routes
		gen yhat = `cons_t_t'+`coef_t_t'*ln_arcdist_t_t_km
		
		replace google_turnoff_turnoff_km = exp(yhat) 			if mi(google_turnoff_turnoff_km)
		replace google_turnoff_turnoff_km = 0 					if arcdist_t_t_km==0
	
	
// Step (4): construct KM and COSTS by adding up each segment. 
							
	// cleaning distances at the segment level
		//keep survey* primary* google_vil_mkt_km_? ///
		//	 google_mkt_turnoff_km_* google_turnoff_turnoff* arcdist_vil_mkt* arcdist_t_t_km
	
		
		local google1 google_vil_mkt_km_O
		local google2 google_mkt_turnoff_km_O
		local google3 google_turnoff_turnoff_km
		local google4 google_mkt_turnoff_km_D
		local google5 google_vil_mkt_km_D
		
		local geodist1 arcdist_vil_mkt_km_O
		local geodist2 arcdist_mkt_turnoff_km_O
		local geodist3 arcdist_t_t_km
		local geodist4 arcdist_mkt_turnoff_km_D
		local geodist5 arcdist_vil_mkt_km_D
		
		forvalues i=1/5{
			replace `google`i'' = 0 if `geodist`i''==0
			
			gen perc_diff = (`google`i'' - `geodist`i'')/`geodist`i''
			sum perc_diff, d
			
			// replace google distances by 95th or 5th percentiles of arc distances
			replace `google`i'' = `geodist`i''*(r(p95)+1) if perc_diff>=r(p95) & !mi(`google`i'')
			replace `google`i'' = `geodist`i''*(r(p5)+1) if perc_diff<=r(p5)
			
			cap drop perc_diff
			
			gen perc_diff = (`google`i'' - `geodist`i'')/`geodist`i''
			sum perc_diff
			cap drop perc_diff
			}
		
		fsum google_vil_mkt_km_O google_mkt_turnoff_km_O google_turnoff_turnoff_km google_mkt_turnoff_km_D google_vil_mkt_km_D
		
	// generate routes where we assume it goes through the primary markets
		gen DIST_km		= google_vil_mkt_km_O + google_mkt_turnoff_km_O + google_turnoff_turnoff_km ///
						+ google_mkt_turnoff_km_D + google_vil_mkt_km_D
						
		gen DIST_cost_USD	= RQ_vil_mkt_cost_USD_O + google_mkt_turnoff_km_O*${pavedcost_perkm} + google_turnoff_turnoff_km*${pavedcost_perkm} ///
							+ google_mkt_turnoff_km_D*${pavedcost_perkm} + RQ_mkt_vil_cost_USD_D
					
	// this is the route without going through markets (when the route is shorter)
		gen DIST_km_direct = DIST_km
		replace DIST_km_direct = google_vil_vil_inM_km if google_vil_vil_inM_km < DIST_km & ///
														 (!mi(google_vil_vil_inM_km) & !mi(DIST_km))
						
		gen DIST_cost_USD_direct = DIST_cost_USD
		replace DIST_cost_USD_direct = google_vil_vil_inM_km*${ruralcost_perkm} if google_vil_vil_inM_km < DIST_km & (!mi(google_vil_vil_inM_km) & !mi(DIST_km))

		
		geodist village_lat_O village_lon_O village_lat_D village_lon_D, gen(arcdist_vil_vil_km)
		replace arcdist_vil_vil_km = 0 if survey_village_O==survey_village_D & survey_district_O == survey_district_D & survey_ward_O == survey_ward_D & primary_market_O == primary_market_D
		gen perc_diff = (DIST_km_direct - arcdist_vil_vil_km)/arcdist_vil_vil_km
		sum perc_diff, d
		br DIST_km_direct arcdist_vil_vil_km perc_diff google_vil_mkt_km_O google_mkt_turnoff_km_O google_turnoff_turnoff_km google_mkt_turnoff_km_D google_vil_mkt_km_D if perc_diff>12 & !mi(perc_diff)
		sort perc_diff
		
	// share of rural km and rural cost: this is 1 when the route does not go through markets																			
		gen rural_km_share = (google_vil_mkt_km_O+google_vil_mkt_km_D)/DIST_km
		replace rural_km_share = 1 if google_vil_vil_inM_km < DIST_km & (!mi(google_vil_vil_inM_km) & !mi(DIST_km))
		
		gen rural_cost_share = (RQ_vil_mkt_cost_USD_O+RQ_mkt_vil_cost_USD_D)/DIST_cost_USD
		replace rural_cost_share = 1 if google_vil_vil_inM_km < DIST_km & (!mi(google_vil_vil_inM_km) & !mi(DIST_km))
		
	
		keep survey* primary* DIST_* rural_km_share rural_cost_share google_vil_mkt_km_? ///
		google_mkt_turnoff_km_* google_turnoff_turnoff* arcdist_vil_mkt* arcdist_t_t_km arcdist_vil_vil_km
		
	tempfile ViltoMkt_KMCOST
	save `ViltoMkt_KMCOST'
	
	
